# !apt install git-lfs
# !git lfs install
# !git clone https://huggingface.co/datasets/parambharat/malayalam_asr_corpus
# !add-apt-repository -y ppa:jonathonf/ffmpeg-4
# !apt update
# !apt install -y ffmpeg
# !pip uninstall -y transformers datasets
# !pip install audiomentations
# !pip install git+https://github.com/huggingface/datasets
# !pip install git+https://github.com/huggingface/transformers
# !pip install librosa soundfile
# !pip install "evaluate>=0.3.0"
# !pip install jiwer
# !pip install more-itertools
# !pip install wandb
# !pip install bitsandbytes
# !pip install "bokeh<2.5.0"
# !pip install "holoviews[recommended]"
# !pip install pyarrow
%set_env WANDB_LOG_MODEL=True
%set_env WANDB_WATCH=all
%set_env WANDB_NOTEBOOK_NAME=whisper_small_ml.ipynb
env: WANDB_LOG_MODEL=True env: WANDB_WATCH=all env: WANDB_NOTEBOOK_NAME=whisper_small_ml.ipynb
import torch
from torch.utils.data import IterableDataset
from io import StringIO
import string
from dataclasses import dataclass
from typing import Any, Dict, List, Union
import wandb
from IPython.display import clear_output
from audiomentations import Compose, AddGaussianNoise, TimeStretch, PitchShift, Shift
import numpy as np
from transformers import WhisperForConditionalGeneration
from transformers import WhisperProcessor
from transformers import Seq2SeqTrainingArguments
from transformers import Seq2SeqTrainer
from transformers import WhisperTokenizer
from transformers import WhisperFeatureExtractor
from huggingface_hub import notebook_login
from transformers import TrainerCallback
from transformers.integrations import WandbCallback
from transformers.trainer_pt_utils import IterableDatasetShard
from datasets import Dataset, IterableDatasetDict, load_dataset, interleave_datasets, Audio
from datasets import load_dataset, Audio
import evaluate
from pathlib import Path
import pandas as pd
import holoviews as hv
import panel as pn
import tempfile
from bokeh.resources import INLINE
import jiwer
hv.extension("bokeh", logo=False)
# wandb.login()
# notebook_login()
run = wandb.init(project="whisper_finetuning", job_type="fine-tuning", group="small-ml")
artifact = run.use_artifact('parambharat/whisper_finetuning/model-2dl0413q:latest', type='model')
artifact_dir = artifact.download()
wandb: Currently logged in as: parambharat. Use `wandb login --relogin` to force relogin
/home/ubuntu/whisper-finetuning/notebooks/wandb/run-20221212_191352-219trr2l
wandb: Downloading large artifact model-2dl0413q:latest, 923.99MB. 10 files... wandb: 10 of 10 files downloaded. Done. 0:0:0.0
def load_data_splits(is_streaming=True, stopping_strategy="all_exhausted"):
    """Load the local Malayalam ASR corpus splits.

    Args:
        is_streaming: forwarded to ``load_dataset`` so splits come back as
            streaming iterable datasets instead of being materialized on disk.
        stopping_strategy: kept for interface compatibility; it would only be
            relevant when interleaving several corpora and is currently unused.

    Returns:
        The dataset dict produced by ``load_dataset`` (one entry per split).
    """
    # the original also created an unused local dict here; removed as dead code
    data_dict = load_dataset("../data/malayalam_asr_corpus/", streaming=is_streaming)
    return data_dict
# materialize the streaming dataset splits (train/test/...)
dataset_dict = load_data_splits()
# waveform-level augmentation pipeline; each transform fires independently with p=0.3
augment_waveform = Compose([
    AddGaussianNoise(min_amplitude=0.005, max_amplitude=0.015, p=0.3),
    TimeStretch(min_rate=0.8, max_rate=1.25, p=0.3, leave_length_unchanged=False),
    PitchShift(min_semitones=-4, max_semitones=4, p=0.3)
    ,])
def augment_dataset(batch):
    """Apply the waveform augmentation pipeline to one example, in place."""
    samples = batch["audio"]["array"]
    # NOTE(review): assumes the audio was already resampled to 16 kHz upstream
    batch["audio"]["array"] = augment_waveform(samples=samples, sample_rate=16000)
    return batch
# call augment dataset on the training set
# (augmentation is train-only; the eval/test audio is left untouched)
dataset_dict["train"] = dataset_dict["train"].map(augment_dataset)
# log-Mel feature extractor for whisper-small
feature_extractor = WhisperFeatureExtractor.from_pretrained(
    "openai/whisper-small"
)
# tokenizer configured for Malayalam transcription, capped at 225 tokens
tokenizer = WhisperTokenizer.from_pretrained(
    "openai/whisper-small",
    language="Malayalam",
    task="transcribe",
    model_max_length=225
)
# processor bundles the feature extractor and tokenizer behind one object
processor = WhisperProcessor.from_pretrained(
    "openai/whisper-small",
    language="Malayalam",
    task="transcribe",
    model_max_length=225
)
def fix_sentence(sentence):
    """Normalise a transcription for training.

    - strips a single pair of surrounding double quotes,
    - ensures the sentence ends with ".", "?" or "!",
    - removes all other punctuation from the body of the sentence.

    Args:
        sentence: raw transcription string.

    Returns:
        The cleaned transcription. A string that is empty after quote
        stripping (e.g. ``'""'``) is returned as "" instead of raising
        IndexError as the original did.
    """
    transcription = sentence
    if transcription.startswith('"') and transcription.endswith('"'):
        # we can remove trailing quotation marks as they do not affect the transcription
        transcription = transcription[1:-1]
    if not transcription:
        # BUGFIX: '""' (or a lone '"') becomes empty here and the
        # transcription[-1] accesses below would raise IndexError
        return transcription
    if transcription[-1] not in [".", "?", "!"]:
        # append a full-stop to sentences that do not end in punctuation
        transcription = transcription + "."
    # strip punctuation from everything except the final sentence terminator
    transcription = transcription[:-1].translate(str.maketrans('', '', string.punctuation)) + transcription[-1]
    return transcription
def prepare_dataset(examples):
    """Turn one raw example into (input_features, labels) for Whisper."""
    # compute the log-Mel input features from the raw waveform
    waveform = examples["audio"]["array"]
    features = feature_extractor(waveform, sampling_rate=16000).input_features
    examples["input_features"] = features[0]
    # normalise the transcript, then encode it into label ids
    cleaned = fix_sentence(examples["sentence"])
    examples["labels"] = tokenizer(cleaned, max_length=225, truncation=True).input_ids
    return examples
def filter_empty_strings(sentence):
    """Keep only transcripts that are at least two characters long."""
    return len(sentence) >= 2
# drop transcripts shorter than two characters from every split
for k in dataset_dict:
    dataset_dict[k] = dataset_dict[k].filter(filter_empty_strings, input_columns=["sentence"])
# compute features/labels on every split and switch to torch-tensor format
for k in dataset_dict:
    dataset_dict[k] = dataset_dict[k].map(
        prepare_dataset,).with_format("torch")
# shuffle the streaming train split with a 500-example buffer
dataset_dict["train"] = dataset_dict["train"].shuffle(buffer_size=500)
@dataclass
class DataCollatorSpeechSeq2SeqWithPadding:
    """Collator that pads audio features and label ids separately.

    Audio features are padded by the feature extractor; label ids are padded
    by the tokenizer and then masked with -100 so padding positions are
    ignored by the loss.
    """
    # WhisperProcessor holding both the feature extractor and the tokenizer
    processor: Any
    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        # split inputs and labels since they have to be of different lengths and need different padding methods
        # first treat the audio inputs by simply returning torch tensors
        input_features = [{"input_features": feature["input_features"]} for feature in features]
        batch = self.processor.feature_extractor.pad(input_features, return_tensors="pt")
        # get the tokenized label sequences
        # NOTE(review): truncate_sequences is called with its defaults
        # (num_tokens_to_remove=0), so this looks like a pass-through — confirm
        label_features = [{"input_ids": self.processor.tokenizer.truncate_sequences(feature["labels"])[0]}
                          for feature in features]
        # pad the labels to max length
        labels_batch = self.processor.tokenizer.pad(label_features, return_tensors="pt",)
        # replace padding with -100 to ignore loss correctly
        labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)
        # if bos token is appended in previous tokenization step,
        # cut bos token here as it's append later anyways
        if (labels[:, 0] == self.processor.tokenizer.bos_token_id).all().cpu().item():
            labels = labels[:, 1:]
        batch["labels"] = labels
        return batch
# collator instance shared by the trainer
data_collator = DataCollatorSpeechSeq2SeqWithPadding(processor=processor)
# word-error-rate metric from the evaluate hub
metric = evaluate.load("wer")
# evaluate with the 'normalised' WER
do_normalize_eval = True
def compute_metrics(pred):
    """Compute the corpus-level word error rate (percent) for an eval run."""
    predictions = pred.predictions
    labels = pred.label_ids
    # -100 is the loss ignore-index set by the collator; restore pad ids so
    # the sequences decode cleanly (note: mutates pred.label_ids in place,
    # matching the original behavior)
    labels[labels == -100] = processor.tokenizer.pad_token_id
    # decode without grouping tokens; normalisation follows the module flag
    decoded_preds = processor.tokenizer.batch_decode(
        predictions, skip_special_tokens=True, normalize=do_normalize_eval)
    decoded_labels = processor.tokenizer.batch_decode(
        labels, skip_special_tokens=True, normalize=do_normalize_eval)
    wer_score = 100 * metric.compute(predictions=decoded_preds, references=decoded_labels)
    return {"wer": wer_score}
# resume from the fine-tuned checkpoint downloaded from the W&B artifact above
model = WhisperForConditionalGeneration.from_pretrained(artifact_dir, use_cache=False)
# clear forced/suppressed decoder tokens so generation is unconstrained
model.config.forced_decoder_ids = None
model.config.suppress_tokens = []
# keep the KV cache off during training (gradient checkpointing is enabled below)
model.config.use_cache = False
# trainer callback to reinitialise and reshuffle the streamable datasets at the beginning of each epoch
class ShuffleCallback(TrainerCallback):
    """Re-seeds the shuffle buffer of a streaming train dataset each epoch."""
    def on_epoch_begin(self, args, state, control, train_dataloader, **kwargs):
        # NOTE(review): the isinstance order looks deliberate — the shard
        # wrapper is checked before the generic IterableDataset case
        if isinstance(train_dataloader.dataset, IterableDatasetShard):
            pass  # set_epoch() is handled by the Trainer
        elif isinstance(train_dataloader.dataset, IterableDataset):
            # bump the epoch counter so the dataset reshuffles;
            # NOTE(review): relies on the private `_epoch` attribute of
            # datasets' IterableDataset — verify on library upgrades
            train_dataloader.dataset.set_epoch(train_dataloader.dataset._epoch + 1)
def load_samples_dataset(dataset, num_samples=100):
    """Materialize the first `num_samples` items of a (streaming) dataset."""
    # zip against a bounded range so iteration stops after num_samples items
    samples = [record for _, record in zip(range(num_samples), dataset)]
    return Dataset.from_list(samples)
def compute_spectrograms(example):
    """Return an un-padded log-Mel spectrogram for one example."""
    audio_array = example["audio"]["array"]
    features = feature_extractor(
        audio_array, sampling_rate=16000, padding="do_not_pad").input_features
    return {"spectrogram": features[0]}
def record_to_html(sample_record):
    """Render one sample as HTML with a synced audio player, waveform plot
    and spectrogram image; returns the HTML in an in-memory StringIO."""
    audio_array = np.array(sample_record["audio"]["array"])
    audio_sr = sample_record["audio"]["sampling_rate"]
    audio_duration = sample_record["length"]
    audio_spectrogram = np.array(sample_record["spectrogram"])
    bounds = (0,0, audio_duration, audio_spectrogram.max())
    # scale the float waveform to int16 range for the audio widget
    waveform_int = np.int16(audio_array * 32767)
    hv_audio = pn.pane.Audio(waveform_int, sample_rate=audio_sr, name='Audio', throttle=500)
    slider = pn.widgets.FloatSlider(end=audio_duration, visible=False, step=0.001)
    # cursor lines that track playback position on both plots
    line_audio = hv.VLine(0).opts(color='black')
    line_spec = hv.VLine(0).opts(color='red')
    # browser-side links keep the slider, audio time and cursors in sync
    slider.jslink(hv_audio, value='time', bidirectional=True)
    slider.jslink(line_audio, value='glyph.location')
    slider.jslink(line_spec, value='glyph.location')
    time = np.linspace(0, audio_duration, num=len(audio_array))
    line_plot_hv = hv.Curve(
        (time, audio_array), ["Time (s)", "amplitude"]).opts(
        width=500, height=150, axiswise=True) * line_audio
    hv_spec_gram = hv.Image(
        audio_spectrogram, bounds=(bounds), kdims=["Time (s)", "Frequency (hz)"]).opts(
        width=500, height=150, labelled=[], axiswise=True, color_levels=512)* line_spec
    combined = pn.Row(hv_audio, hv_spec_gram, line_plot_hv, slider)
    # serialize the panel layout to an in-memory HTML document
    audio_html = StringIO()
    combined.save(audio_html)
    return audio_html
def dataset_to_records(dataset):
    """Build a DataFrame with one row per example: rendered audio HTML,
    reference sentence, and audio length."""
    rows = []
    for sample in dataset:
        rows.append({
            "audio_with_spec": wandb.Html(record_to_html(sample)),
            "sentence": sample["sentence"],
            "length": sample["length"],
        })
    return pd.DataFrame(rows)
def decode_predictions(trainer, predictions):
    """Decode generated token ids into plain-text prediction strings."""
    decoded = trainer.tokenizer.batch_decode(
        predictions.predictions, skip_special_tokens=True)
    return decoded
def compute_measures(predictions, labels):
    """Per-sample WER breakdown (hits/substitutions/deletions/insertions)."""
    # jiwer expects (truth, hypothesis); labels are the ground truth here
    per_sample = [
        jiwer.compute_measures(truth, hypothesis)
        for hypothesis, truth in zip(predictions, labels)
    ]
    columns = ["wer", "hits", "substitutions", "deletions", "insertions"]
    return pd.DataFrame(per_sample)[columns]
class WandbProgressResultsCallback(WandbCallback):
    """WandbCallback that additionally logs a table of sample predictions on
    every log step and uploads model checkpoints as W&B artifacts on save."""

    def __init__(self, trainer, sample_dataset):
        super().__init__()
        self.trainer = trainer
        self.sample_dataset = sample_dataset
        # pre-render the audio/spectrogram HTML once; reused at every log step
        self.records_df = dataset_to_records(sample_dataset)

    def on_log(self, args, state, control, model=None, logs=None, **kwargs):
        """Log regular metrics, then a per-sample prediction table."""
        super().on_log(args, state, control, model, logs)
        # BUGFIX: was `trainer.predict(...)`, which silently used the
        # module-level `trainer` global instead of the trainer this callback
        # was constructed with
        predictions = self.trainer.predict(self.sample_dataset)
        predictions = decode_predictions(self.trainer, predictions)
        measures_df = compute_measures(predictions, self.records_df["sentence"].tolist())
        records_df = pd.concat([self.records_df, measures_df], axis=1)
        records_df["prediction"] = predictions
        records_df["step"] = state.global_step
        records_table = self._wandb.Table(dataframe=records_df)
        self._wandb.log({"sample_predictions": records_table})

    def on_save(self, args, state, control, model=None, tokenizer=None, **kwargs):
        """Save the current model to a temp dir and log it as a W&B artifact."""
        if self._wandb is None:
            return
        if self._log_model and self._initialized and state.is_world_process_zero:
            with tempfile.TemporaryDirectory() as temp_dir:
                self.trainer.save_model(temp_dir)
                # numeric summary values become artifact metadata; with
                # load_best_model_at_end only the best metric and flos are kept
                metadata = (
                    {
                        k: v
                        for k, v in dict(self._wandb.summary).items()
                        if isinstance(v, numbers.Number) and not k.startswith("_")
                    }
                    if not args.load_best_model_at_end
                    else {
                        f"eval/{args.metric_for_best_model}": state.best_metric,
                        # NOTE(review): "total_floss" (sic) mirrors the key used
                        # by transformers' own WandbCallback — kept as-is
                        "train/total_floss": state.total_flos,
                    }
                )
                artifact = self._wandb.Artifact(
                    name=f"model-{self._wandb.run.id}",
                    type="model", metadata=metadata)
                for f in Path(temp_dir).glob("*"):
                    if f.is_file():
                        with artifact.new_file(f.name, mode="wb") as fa:
                            fa.write(f.read_bytes())
                self._wandb.run.log_artifact(artifact)
# training configuration: 3000 optimizer steps at batch size 64, fp16,
# 8-bit Adam, eval/checkpoint every 500 steps, keep the best (lowest) WER model
training_args = Seq2SeqTrainingArguments(
    output_dir="../models/whisper-small-ml", # change to a repo name of your choice
    per_device_train_batch_size=64,
    gradient_accumulation_steps=1, # increase by 2x for every 2x decrease in batch size
    learning_rate=1e-5,
    save_total_limit=4,
    warmup_steps=500,
    max_steps=3000,
    gradient_checkpointing=True,
    fp16=True,
    # fp16_full_eval=True,
    optim="adamw_bnb_8bit",  # bitsandbytes 8-bit AdamW to cut optimizer memory
    evaluation_strategy="steps",
    per_device_eval_batch_size=32,
    predict_with_generate=True,
    generation_max_length=225,
    save_steps=500,
    eval_steps=500,
    logging_steps=250,
    report_to="none",  # W&B reporting is handled by the custom callback instead
    load_best_model_at_end=True,
    metric_for_best_model="wer",
    greater_is_better=False,  # lower WER is better
    push_to_hub=True,
    hub_strategy="checkpoint",
    remove_unused_columns=False,
    ignore_data_skip=True
)
# take 100 test examples and attach spectrograms for the W&B preview table
samples_dataset = load_samples_dataset(dataset_dict["test"]).map(compute_spectrograms)
0%| | 0/100 [00:00<?, ?ex/s]
# note: `tokenizer=processor` makes the trainer save the full processor
# (feature extractor + tokenizer) alongside every checkpoint
trainer = Seq2SeqTrainer(
    args=training_args,
    model=model,
    train_dataset=dataset_dict["train"],
    eval_dataset=samples_dataset,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    tokenizer=processor,
    callbacks=[ShuffleCallback()],
)
Cloning https://huggingface.co/parambharat/whisper-small-ml into local empty directory.
Download file pytorch_model.bin: 0%| | 32.0k/922M [00:00<?, ?B/s]
Download file training_args.bin: 100%|##########| 3.48k/3.48k [00:00<?, ?B/s]
Clean file training_args.bin: 29%|##8 | 1.00k/3.48k [00:00<?, ?B/s]
Clean file pytorch_model.bin: 0%| | 1.00k/922M [00:00<?, ?B/s]
max_steps is given, it will override any value given in num_train_epochs Using cuda_amp half precision backend
# attach the rich W&B progress callback (sample predictions + model artifacts)
progress_callback = WandbProgressResultsCallback(trainer, samples_dataset)
clear_output()
trainer.add_callback(progress_callback)
# seed the output dir with the initial model + processor before training starts
model.save_pretrained(training_args.output_dir)
processor.save_pretrained(training_args.output_dir)
Configuration saved in ../models/whisper-small-ml/config.json Model weights saved in ../models/whisper-small-ml/pytorch_model.bin Feature extractor saved in ../models/whisper-small-ml/preprocessor_config.json tokenizer config file saved in ../models/whisper-small-ml/tokenizer_config.json Special tokens file saved in ../models/whisper-small-ml/special_tokens_map.json added tokens file saved in ../models/whisper-small-ml/added_tokens.json
# run fine-tuning (max_steps=3000 optimizer steps)
trainer.train()
***** Running training ***** Num examples = 192000 Num Epochs = 9223372036854775807 Instantaneous batch size per device = 64 Total train batch size (w. parallel, distributed & accumulation) = 64 Gradient Accumulation steps = 1 Total optimization steps = 3000 Number of trainable parameters = 241734912 Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"
| Step | Training Loss | Validation Loss | Wer |
|---|---|---|---|
| 500 | 0.127500 | 0.163005 | 35.401460 |
| 1000 | 0.090000 | 0.182085 | 40.024331 |
| 1500 | 0.062000 | 0.200353 | 37.712895 |
| 2000 | 0.044100 | 0.210513 | 36.253041 |
| 2500 | 0.033500 | 0.224975 | 37.712895 |
| 3000 | 0.027600 | 0.230829 | 36.739659 |
***** Running Prediction ***** Num examples = 100 Batch size = 32 ***** Running Prediction ***** Num examples = 100 Batch size = 32 ***** Running Evaluation ***** Num examples = 100 Batch size = 32 ***** Running Prediction ***** Num examples = 100 Batch size = 32 Saving model checkpoint to ../models/whisper-small-ml/checkpoint-500 Configuration saved in ../models/whisper-small-ml/checkpoint-500/config.json Model weights saved in ../models/whisper-small-ml/checkpoint-500/pytorch_model.bin Feature extractor saved in ../models/whisper-small-ml/checkpoint-500/preprocessor_config.json tokenizer config file saved in ../models/whisper-small-ml/checkpoint-500/tokenizer_config.json Special tokens file saved in ../models/whisper-small-ml/checkpoint-500/special_tokens_map.json added tokens file saved in ../models/whisper-small-ml/checkpoint-500/added_tokens.json Feature extractor saved in ../models/whisper-small-ml/preprocessor_config.json tokenizer config file saved in ../models/whisper-small-ml/tokenizer_config.json Special tokens file saved in ../models/whisper-small-ml/special_tokens_map.json added tokens file saved in ../models/whisper-small-ml/added_tokens.json Saving model checkpoint to /tmp/tmp2wmi9emx Configuration saved in /tmp/tmp2wmi9emx/config.json Model weights saved in /tmp/tmp2wmi9emx/pytorch_model.bin Feature extractor saved in /tmp/tmp2wmi9emx/preprocessor_config.json tokenizer config file saved in /tmp/tmp2wmi9emx/tokenizer_config.json Special tokens file saved in /tmp/tmp2wmi9emx/special_tokens_map.json added tokens file saved in /tmp/tmp2wmi9emx/added_tokens.json Saving model checkpoint to ../models/whisper-small-ml Configuration saved in ../models/whisper-small-ml/config.json Model weights saved in ../models/whisper-small-ml/pytorch_model.bin Feature extractor saved in ../models/whisper-small-ml/preprocessor_config.json tokenizer config file saved in ../models/whisper-small-ml/tokenizer_config.json Special tokens file saved in 
../models/whisper-small-ml/special_tokens_map.json added tokens file saved in ../models/whisper-small-ml/added_tokens.json Several commits (2) will be pushed upstream. The progress bars may be unreliable.
Upload file pytorch_model.bin: 0%| | 32.0k/922M [00:00<?, ?B/s]
Upload file last-checkpoint/optimizer.pt: 0%| | 32.0k/700M [00:00<?, ?B/s]
remote: Scanning LFS files for validity, may be slow...
remote: LFS file scan complete.
To https://huggingface.co/parambharat/whisper-small-ml
5d4a3d1..5d2c21c main -> main
Dropping the following result as it does not have all the necessary fields:
{'task': {'name': 'Automatic Speech Recognition', 'type': 'automatic-speech-recognition'}}
To https://huggingface.co/parambharat/whisper-small-ml
5d2c21c..40e9491 main -> main
***** Running Prediction *****
Num examples = 100
Batch size = 32
***** Running Prediction *****
Num examples = 100
Batch size = 32
***** Running Evaluation *****
Num examples = 100
Batch size = 32
***** Running Prediction *****
Num examples = 100
Batch size = 32
Saving model checkpoint to ../models/whisper-small-ml/checkpoint-1000
Configuration saved in ../models/whisper-small-ml/checkpoint-1000/config.json
Model weights saved in ../models/whisper-small-ml/checkpoint-1000/pytorch_model.bin
Feature extractor saved in ../models/whisper-small-ml/checkpoint-1000/preprocessor_config.json
tokenizer config file saved in ../models/whisper-small-ml/checkpoint-1000/tokenizer_config.json
Special tokens file saved in ../models/whisper-small-ml/checkpoint-1000/special_tokens_map.json
added tokens file saved in ../models/whisper-small-ml/checkpoint-1000/added_tokens.json
Feature extractor saved in ../models/whisper-small-ml/preprocessor_config.json
tokenizer config file saved in ../models/whisper-small-ml/tokenizer_config.json
Special tokens file saved in ../models/whisper-small-ml/special_tokens_map.json
added tokens file saved in ../models/whisper-small-ml/added_tokens.json
Saving model checkpoint to /tmp/tmpyxx0nddx
Configuration saved in /tmp/tmpyxx0nddx/config.json
Model weights saved in /tmp/tmpyxx0nddx/pytorch_model.bin
Feature extractor saved in /tmp/tmpyxx0nddx/preprocessor_config.json
tokenizer config file saved in /tmp/tmpyxx0nddx/tokenizer_config.json
Special tokens file saved in /tmp/tmpyxx0nddx/special_tokens_map.json
added tokens file saved in /tmp/tmpyxx0nddx/added_tokens.json
Saving model checkpoint to ../models/whisper-small-ml
Configuration saved in ../models/whisper-small-ml/config.json
Model weights saved in ../models/whisper-small-ml/pytorch_model.bin
Feature extractor saved in ../models/whisper-small-ml/preprocessor_config.json
tokenizer config file saved in ../models/whisper-small-ml/tokenizer_config.json
Special tokens file saved in ../models/whisper-small-ml/special_tokens_map.json
added tokens file saved in ../models/whisper-small-ml/added_tokens.json
Several commits (2) will be pushed upstream.
The progress bars may be unreliable.
Upload file pytorch_model.bin: 0%| | 32.0k/922M [00:00<?, ?B/s]
Upload file last-checkpoint/optimizer.pt: 0%| | 32.0k/700M [00:00<?, ?B/s]
remote: Scanning LFS files for validity, may be slow...
remote: LFS file scan complete.
To https://huggingface.co/parambharat/whisper-small-ml
40e9491..e003b12 main -> main
Dropping the following result as it does not have all the necessary fields:
{'task': {'name': 'Automatic Speech Recognition', 'type': 'automatic-speech-recognition'}}
To https://huggingface.co/parambharat/whisper-small-ml
e003b12..f25dcc0 main -> main
***** Running Prediction *****
Num examples = 100
Batch size = 32
***** Running Prediction *****
Num examples = 100
Batch size = 32
***** Running Evaluation *****
Num examples = 100
Batch size = 32
***** Running Prediction *****
Num examples = 100
Batch size = 32
Saving model checkpoint to ../models/whisper-small-ml/checkpoint-1500
Configuration saved in ../models/whisper-small-ml/checkpoint-1500/config.json
Model weights saved in ../models/whisper-small-ml/checkpoint-1500/pytorch_model.bin
Feature extractor saved in ../models/whisper-small-ml/checkpoint-1500/preprocessor_config.json
tokenizer config file saved in ../models/whisper-small-ml/checkpoint-1500/tokenizer_config.json
Special tokens file saved in ../models/whisper-small-ml/checkpoint-1500/special_tokens_map.json
added tokens file saved in ../models/whisper-small-ml/checkpoint-1500/added_tokens.json
Feature extractor saved in ../models/whisper-small-ml/preprocessor_config.json
tokenizer config file saved in ../models/whisper-small-ml/tokenizer_config.json
Special tokens file saved in ../models/whisper-small-ml/special_tokens_map.json
added tokens file saved in ../models/whisper-small-ml/added_tokens.json
Saving model checkpoint to /tmp/tmpoawgpo4v
Configuration saved in /tmp/tmpoawgpo4v/config.json
Model weights saved in /tmp/tmpoawgpo4v/pytorch_model.bin
Feature extractor saved in /tmp/tmpoawgpo4v/preprocessor_config.json
tokenizer config file saved in /tmp/tmpoawgpo4v/tokenizer_config.json
Special tokens file saved in /tmp/tmpoawgpo4v/special_tokens_map.json
added tokens file saved in /tmp/tmpoawgpo4v/added_tokens.json
Saving model checkpoint to ../models/whisper-small-ml
Configuration saved in ../models/whisper-small-ml/config.json
Model weights saved in ../models/whisper-small-ml/pytorch_model.bin
Feature extractor saved in ../models/whisper-small-ml/preprocessor_config.json
tokenizer config file saved in ../models/whisper-small-ml/tokenizer_config.json
Special tokens file saved in ../models/whisper-small-ml/special_tokens_map.json
added tokens file saved in ../models/whisper-small-ml/added_tokens.json
Several commits (2) will be pushed upstream.
The progress bars may be unreliable.
Upload file pytorch_model.bin: 0%| | 32.0k/922M [00:00<?, ?B/s]
remote: Scanning LFS files for validity, may be slow...
remote: LFS file scan complete.
To https://huggingface.co/parambharat/whisper-small-ml
f25dcc0..9426188 main -> main
Dropping the following result as it does not have all the necessary fields:
{'task': {'name': 'Automatic Speech Recognition', 'type': 'automatic-speech-recognition'}}
To https://huggingface.co/parambharat/whisper-small-ml
9426188..1d3689a main -> main
***** Running Prediction *****
Num examples = 100
Batch size = 32
***** Running Prediction *****
Num examples = 100
Batch size = 32
***** Running Evaluation *****
Num examples = 100
Batch size = 32
***** Running Prediction *****
Num examples = 100
Batch size = 32
Saving model checkpoint to ../models/whisper-small-ml/checkpoint-2000
Configuration saved in ../models/whisper-small-ml/checkpoint-2000/config.json
Model weights saved in ../models/whisper-small-ml/checkpoint-2000/pytorch_model.bin
Feature extractor saved in ../models/whisper-small-ml/checkpoint-2000/preprocessor_config.json
tokenizer config file saved in ../models/whisper-small-ml/checkpoint-2000/tokenizer_config.json
Special tokens file saved in ../models/whisper-small-ml/checkpoint-2000/special_tokens_map.json
added tokens file saved in ../models/whisper-small-ml/checkpoint-2000/added_tokens.json
Feature extractor saved in ../models/whisper-small-ml/preprocessor_config.json
tokenizer config file saved in ../models/whisper-small-ml/tokenizer_config.json
Special tokens file saved in ../models/whisper-small-ml/special_tokens_map.json
added tokens file saved in ../models/whisper-small-ml/added_tokens.json
Saving model checkpoint to /tmp/tmpcpk221pg
Configuration saved in /tmp/tmpcpk221pg/config.json
Model weights saved in /tmp/tmpcpk221pg/pytorch_model.bin
Feature extractor saved in /tmp/tmpcpk221pg/preprocessor_config.json
tokenizer config file saved in /tmp/tmpcpk221pg/tokenizer_config.json
Special tokens file saved in /tmp/tmpcpk221pg/special_tokens_map.json
added tokens file saved in /tmp/tmpcpk221pg/added_tokens.json
Saving model checkpoint to ../models/whisper-small-ml
Configuration saved in ../models/whisper-small-ml/config.json
Model weights saved in ../models/whisper-small-ml/pytorch_model.bin
Feature extractor saved in ../models/whisper-small-ml/preprocessor_config.json
tokenizer config file saved in ../models/whisper-small-ml/tokenizer_config.json
Special tokens file saved in ../models/whisper-small-ml/special_tokens_map.json
added tokens file saved in ../models/whisper-small-ml/added_tokens.json
Several commits (2) will be pushed upstream.
The progress bars may be unreliable.
Upload file pytorch_model.bin: 0%| | 32.0k/922M [00:00<?, ?B/s]
Upload file last-checkpoint/optimizer.pt: 0%| | 32.0k/700M [00:00<?, ?B/s]
remote: Scanning LFS files for validity, may be slow...
remote: LFS file scan complete.
To https://huggingface.co/parambharat/whisper-small-ml
1d3689a..5de9b65 main -> main
Dropping the following result as it does not have all the necessary fields:
{'task': {'name': 'Automatic Speech Recognition', 'type': 'automatic-speech-recognition'}}
To https://huggingface.co/parambharat/whisper-small-ml
5de9b65..9208d97 main -> main
***** Running Prediction *****
Num examples = 100
Batch size = 32
***** Running Prediction *****
Num examples = 100
Batch size = 32
***** Running Evaluation *****
Num examples = 100
Batch size = 32
***** Running Prediction *****
Num examples = 100
Batch size = 32
Saving model checkpoint to ../models/whisper-small-ml/checkpoint-2500
Configuration saved in ../models/whisper-small-ml/checkpoint-2500/config.json
Model weights saved in ../models/whisper-small-ml/checkpoint-2500/pytorch_model.bin
Feature extractor saved in ../models/whisper-small-ml/checkpoint-2500/preprocessor_config.json
tokenizer config file saved in ../models/whisper-small-ml/checkpoint-2500/tokenizer_config.json
Special tokens file saved in ../models/whisper-small-ml/checkpoint-2500/special_tokens_map.json
added tokens file saved in ../models/whisper-small-ml/checkpoint-2500/added_tokens.json
Feature extractor saved in ../models/whisper-small-ml/preprocessor_config.json
tokenizer config file saved in ../models/whisper-small-ml/tokenizer_config.json
Special tokens file saved in ../models/whisper-small-ml/special_tokens_map.json
added tokens file saved in ../models/whisper-small-ml/added_tokens.json
Deleting older checkpoint [../models/whisper-small-ml/checkpoint-1000] due to args.save_total_limit
Saving model checkpoint to /tmp/tmp5t2kkkrw
Configuration saved in /tmp/tmp5t2kkkrw/config.json
Model weights saved in /tmp/tmp5t2kkkrw/pytorch_model.bin
Feature extractor saved in /tmp/tmp5t2kkkrw/preprocessor_config.json
tokenizer config file saved in /tmp/tmp5t2kkkrw/tokenizer_config.json
Special tokens file saved in /tmp/tmp5t2kkkrw/special_tokens_map.json
added tokens file saved in /tmp/tmp5t2kkkrw/added_tokens.json
Saving model checkpoint to ../models/whisper-small-ml
Configuration saved in ../models/whisper-small-ml/config.json
Model weights saved in ../models/whisper-small-ml/pytorch_model.bin
Feature extractor saved in ../models/whisper-small-ml/preprocessor_config.json
tokenizer config file saved in ../models/whisper-small-ml/tokenizer_config.json
Special tokens file saved in ../models/whisper-small-ml/special_tokens_map.json
added tokens file saved in ../models/whisper-small-ml/added_tokens.json
Several commits (2) will be pushed upstream.
The progress bars may be unreliable.
Upload file pytorch_model.bin: 0%| | 32.0k/922M [00:00<?, ?B/s]
remote: Scanning LFS files for validity, may be slow...
remote: LFS file scan complete.
To https://huggingface.co/parambharat/whisper-small-ml
9208d97..f4c20f9 main -> main
Dropping the following result as it does not have all the necessary fields:
{'task': {'name': 'Automatic Speech Recognition', 'type': 'automatic-speech-recognition'}}
To https://huggingface.co/parambharat/whisper-small-ml
f4c20f9..808e33c main -> main
***** Running Prediction *****
Num examples = 100
Batch size = 32
***** Running Prediction *****
Num examples = 100
Batch size = 32
***** Running Evaluation *****
Num examples = 100
Batch size = 32
***** Running Prediction *****
Num examples = 100
Batch size = 32
Saving model checkpoint to ../models/whisper-small-ml/checkpoint-3000
Configuration saved in ../models/whisper-small-ml/checkpoint-3000/config.json
Model weights saved in ../models/whisper-small-ml/checkpoint-3000/pytorch_model.bin
Feature extractor saved in ../models/whisper-small-ml/checkpoint-3000/preprocessor_config.json
tokenizer config file saved in ../models/whisper-small-ml/checkpoint-3000/tokenizer_config.json
Special tokens file saved in ../models/whisper-small-ml/checkpoint-3000/special_tokens_map.json
added tokens file saved in ../models/whisper-small-ml/checkpoint-3000/added_tokens.json
Feature extractor saved in ../models/whisper-small-ml/preprocessor_config.json
tokenizer config file saved in ../models/whisper-small-ml/tokenizer_config.json
Special tokens file saved in ../models/whisper-small-ml/special_tokens_map.json
added tokens file saved in ../models/whisper-small-ml/added_tokens.json
Deleting older checkpoint [../models/whisper-small-ml/checkpoint-1500] due to args.save_total_limit
Saving model checkpoint to /tmp/tmpxn6yu83x
Configuration saved in /tmp/tmpxn6yu83x/config.json
Model weights saved in /tmp/tmpxn6yu83x/pytorch_model.bin
Feature extractor saved in /tmp/tmpxn6yu83x/preprocessor_config.json
tokenizer config file saved in /tmp/tmpxn6yu83x/tokenizer_config.json
Special tokens file saved in /tmp/tmpxn6yu83x/special_tokens_map.json
added tokens file saved in /tmp/tmpxn6yu83x/added_tokens.json
Saving model checkpoint to ../models/whisper-small-ml
Configuration saved in ../models/whisper-small-ml/config.json
Model weights saved in ../models/whisper-small-ml/pytorch_model.bin
Feature extractor saved in ../models/whisper-small-ml/preprocessor_config.json
tokenizer config file saved in ../models/whisper-small-ml/tokenizer_config.json
Special tokens file saved in ../models/whisper-small-ml/special_tokens_map.json
added tokens file saved in ../models/whisper-small-ml/added_tokens.json
Several commits (2) will be pushed upstream.
The progress bars may be unreliable.
Upload file pytorch_model.bin: 0%| | 32.0k/922M [00:00<?, ?B/s]
remote: Scanning LFS files for validity, may be slow...
remote: LFS file scan complete.
To https://huggingface.co/parambharat/whisper-small-ml
808e33c..ce0c6c5 main -> main
Dropping the following result as it does not have all the necessary fields:
{'task': {'name': 'Automatic Speech Recognition', 'type': 'automatic-speech-recognition'}}
To https://huggingface.co/parambharat/whisper-small-ml
ce0c6c5..cf560b9 main -> main
Training completed. Do not forget to share your model on huggingface.co/models =)
Loading best model from ../models/whisper-small-ml/checkpoint-500 (score: 35.4014598540146).
***** Running Prediction *****
Num examples = 100
Batch size = 32
/home/ubuntu/whisper-finetuning/notebooks/../models/whisper-small-ml is already a clone of https://huggingface.co/parambharat/whisper-small-ml. Make sure you pull the latest changes with `repo.git_pull()`.
max_steps is given, it will override any value given in num_train_epochs
Using cuda_amp half precision backend
Saving model checkpoint to /tmp/tmp6d21ms5i
Configuration saved in /tmp/tmp6d21ms5i/config.json
Model weights saved in /tmp/tmp6d21ms5i/pytorch_model.bin
Feature extractor saved in /tmp/tmp6d21ms5i/preprocessor_config.json
tokenizer config file saved in /tmp/tmp6d21ms5i/tokenizer_config.json
Special tokens file saved in /tmp/tmp6d21ms5i/special_tokens_map.json
added tokens file saved in /tmp/tmp6d21ms5i/added_tokens.json
Saving model checkpoint to ../models/whisper-small-ml
Configuration saved in ../models/whisper-small-ml/config.json
Model weights saved in ../models/whisper-small-ml/pytorch_model.bin
Feature extractor saved in ../models/whisper-small-ml/preprocessor_config.json
tokenizer config file saved in ../models/whisper-small-ml/tokenizer_config.json
Special tokens file saved in ../models/whisper-small-ml/special_tokens_map.json
added tokens file saved in ../models/whisper-small-ml/added_tokens.json
remote: Scanning LFS files for validity, may be slow...
remote: LFS file scan complete.
To https://huggingface.co/parambharat/whisper-small-ml
cf560b9..e388abe main -> main
Dropping the following result as it does not have all the necessary fields:
{'task': {'name': 'Automatic Speech Recognition', 'type': 'automatic-speech-recognition'}}
To https://huggingface.co/parambharat/whisper-small-ml
e388abe..acd6019 main -> main
TrainOutput(global_step=3000, training_loss=0.06887441301345826, metrics={'train_runtime': 40824.8754, 'train_samples_per_second': 4.703, 'train_steps_per_second': 0.073, 'total_flos': 5.501245769220096e+19, 'train_loss': 0.06887441301345826, 'epoch': 28.02})
def load_test_dataset(config="fleurs"):
    """Load a streaming test split for Malayalam ASR evaluation.

    Args:
        config: ``"fleurs"`` selects the google/fleurs ``ml_in`` test split;
            any other value selects mozilla-foundation/common_voice_11_0
            ``ml`` test split.

    Returns:
        A streaming dataset with only ``audio`` and ``sentence`` columns,
        resampled to 16 kHz, mapped through ``prepare_dataset`` (defined
        earlier in this notebook) and formatted as torch tensors.
    """
    # Whisper's feature extractor expects 16 kHz input. The original code
    # passed 160000 (an extra zero), i.e. 160 kHz — a typo, fixed here.
    sampling_rate = 16000
    if config == "fleurs":
        test_split = load_dataset(
            "google/fleurs", "ml_in", split="test",
            use_auth_token=True, streaming=True,
        )
        # FLEURS names the transcript column "transcription"; normalize it
        # to "sentence" so both corpora share the same schema.
        test_split = test_split.rename_column("transcription", "sentence")
    else:
        test_split = load_dataset(
            "mozilla-foundation/common_voice_11_0", "ml", split="test",
            use_auth_token=True, streaming=True,
        )
    # Drop everything except the columns evaluation needs.
    test_split = test_split.remove_columns(
        [col for col in test_split.features.keys() if col not in ["audio", "sentence"]]
    )
    # Cast the audio column exactly once (the original common-voice branch
    # cast it twice).
    test_split = test_split.cast_column("audio", Audio(sampling_rate=sampling_rate))
    return test_split.map(prepare_dataset).with_format("torch")
test_dataset = load_test_dataset("common voice")
Downloading builder script: 0%| | 0.00/8.30k [00:00<?, ?B/s]
Downloading readme: 0%| | 0.00/12.2k [00:00<?, ?B/s]
Downloading extra modules: 0%| | 0.00/3.44k [00:00<?, ?B/s]
Downloading extra modules: 0%| | 0.00/60.9k [00:00<?, ?B/s]
trainer.evaluate(test_dataset)
***** Running Evaluation ***** Num examples: Unknown Batch size = 32 Reading metadata...: 112it [00:00, 1358.16it/s]
--------------------------------------------------------------------------- KeyboardInterrupt Traceback (most recent call last) Cell In[40], line 1 ----> 1 trainer.evaluate(test_dataset) File ~/whisper_ft/lib/python3.8/site-packages/transformers/trainer_seq2seq.py:78, in Seq2SeqTrainer.evaluate(self, eval_dataset, ignore_keys, metric_key_prefix, **gen_kwargs) 73 gen_kwargs["num_beams"] = ( 74 gen_kwargs["num_beams"] if gen_kwargs.get("num_beams") is not None else self.args.generation_num_beams 75 ) 76 self._gen_kwargs = gen_kwargs ---> 78 return super().evaluate(eval_dataset, ignore_keys=ignore_keys, metric_key_prefix=metric_key_prefix) File ~/whisper_ft/lib/python3.8/site-packages/transformers/trainer.py:2818, in Trainer.evaluate(self, eval_dataset, ignore_keys, metric_key_prefix) 2815 start_time = time.time() 2817 eval_loop = self.prediction_loop if self.args.use_legacy_prediction_loop else self.evaluation_loop -> 2818 output = eval_loop( 2819 eval_dataloader, 2820 description="Evaluation", 2821 # No point gathering the predictions if there are no metrics, otherwise we defer to 2822 # self.args.prediction_loss_only 2823 prediction_loss_only=True if self.compute_metrics is None else None, 2824 ignore_keys=ignore_keys, 2825 metric_key_prefix=metric_key_prefix, 2826 ) 2828 total_batch_size = self.args.eval_batch_size * self.args.world_size 2829 if f"{metric_key_prefix}_jit_compilation_time" in output.metrics: File ~/whisper_ft/lib/python3.8/site-packages/transformers/trainer.py:3000, in Trainer.evaluation_loop(self, dataloader, description, prediction_loss_only, ignore_keys, metric_key_prefix) 2997 batch_size = observed_batch_size 2999 # Prediction step -> 3000 loss, logits, labels = self.prediction_step(model, inputs, prediction_loss_only, ignore_keys=ignore_keys) 3001 inputs_decode = self._prepare_input(inputs["input_ids"]) if args.include_inputs_for_metrics else None 3003 if is_torch_tpu_available(): File 
~/whisper_ft/lib/python3.8/site-packages/transformers/trainer_seq2seq.py:198, in Seq2SeqTrainer.prediction_step(self, model, inputs, prediction_loss_only, ignore_keys) 195 else: 196 generation_inputs = inputs[self.model.main_input_name] --> 198 generated_tokens = self.model.generate( 199 generation_inputs, 200 **gen_kwargs, 201 ) 202 # in case the batch is shorter than max length, the output should be padded 203 if gen_kwargs.get("max_length") is not None and generated_tokens.shape[-1] < gen_kwargs["max_length"]: File ~/whisper_ft/lib/python3.8/site-packages/torch/autograd/grad_mode.py:27, in _DecoratorContextManager.__call__.<locals>.decorate_context(*args, **kwargs) 24 @functools.wraps(func) 25 def decorate_context(*args, **kwargs): 26 with self.clone(): ---> 27 return func(*args, **kwargs) File ~/whisper_ft/lib/python3.8/site-packages/transformers/generation/utils.py:1518, in GenerationMixin.generate(self, inputs, max_length, min_length, do_sample, early_stopping, num_beams, temperature, penalty_alpha, top_k, top_p, typical_p, repetition_penalty, bad_words_ids, force_words_ids, bos_token_id, pad_token_id, eos_token_id, length_penalty, no_repeat_ngram_size, encoder_no_repeat_ngram_size, num_return_sequences, max_time, max_new_tokens, decoder_start_token_id, use_cache, num_beam_groups, diversity_penalty, prefix_allowed_tokens_fn, logits_processor, renormalize_logits, stopping_criteria, constraints, output_attentions, output_hidden_states, output_scores, return_dict_in_generate, forced_bos_token_id, forced_eos_token_id, remove_invalid_values, synced_gpus, exponential_decay_length_penalty, suppress_tokens, begin_suppress_tokens, forced_decoder_ids, **model_kwargs) 1513 raise ValueError( 1514 f"num_return_sequences has to be 1, but is {num_return_sequences} when doing greedy search." 1515 ) 1517 # 10. 
run greedy search -> 1518 return self.greedy_search( 1519 input_ids, 1520 logits_processor=logits_processor, 1521 stopping_criteria=stopping_criteria, 1522 pad_token_id=pad_token_id, 1523 eos_token_id=eos_token_id, 1524 output_scores=output_scores, 1525 return_dict_in_generate=return_dict_in_generate, 1526 synced_gpus=synced_gpus, 1527 **model_kwargs, 1528 ) 1530 elif is_contrastive_search_gen_mode: 1532 if num_return_sequences > 1: File ~/whisper_ft/lib/python3.8/site-packages/transformers/generation/utils.py:2298, in GenerationMixin.greedy_search(self, input_ids, logits_processor, stopping_criteria, max_length, pad_token_id, eos_token_id, output_attentions, output_hidden_states, output_scores, return_dict_in_generate, synced_gpus, **model_kwargs) 2295 next_token_logits = outputs.logits[:, -1, :] 2297 # pre-process distribution -> 2298 next_tokens_scores = logits_processor(input_ids, next_token_logits) 2300 # Store scores, attentions and hidden_states when required 2301 if return_dict_in_generate: File ~/whisper_ft/lib/python3.8/site-packages/transformers/generation/logits_process.py:92, in LogitsProcessorList.__call__(self, input_ids, scores, **kwargs) 90 scores = processor(input_ids, scores, **kwargs) 91 else: ---> 92 scores = processor(input_ids, scores) 93 return scores File ~/whisper_ft/lib/python3.8/site-packages/transformers/generation/logits_process.py:733, in SuppressTokensLogitsProcessor.__call__(self, input_ids, scores) 732 def __call__(self, input_ids, scores): --> 733 scores[:, self.suppress_tokens] = -float("inf") 734 return scores KeyboardInterrupt:
# Metadata used by the Trainer to build the model card when pushing the
# checkpoint to the Hugging Face Hub.
model_card_kwargs = {
    # "dataset_tags": "mozilla-foundation/common_voice_11_0",
    # "dataset": "Common Voice 11.0",  # a 'pretty' name for the training dataset
    "language": "ml",
    "model_name": "Whisper Small ML - Bharat Ramanathan",  # a 'pretty' name for your model
    "finetuned_from": "openai/whisper-small",
    "tasks": "automatic-speech-recognition",
    "tags": "whisper-event",
}
trainer.push_to_hub(**model_card_kwargs)
Saving model checkpoint to ../models/whisper-small-ml
Configuration saved in ../models/whisper-small-ml/config.json
Model weights saved in ../models/whisper-small-ml/pytorch_model.bin
Feature extractor saved in ../models/whisper-small-ml/preprocessor_config.json
tokenizer config file saved in ../models/whisper-small-ml/tokenizer_config.json
Special tokens file saved in ../models/whisper-small-ml/special_tokens_map.json
added tokens file saved in ../models/whisper-small-ml/added_tokens.json
Dropping the following result as it does not have all the necessary fields:
{'task': {'name': 'Automatic Speech Recognition', 'type': 'automatic-speech-recognition'}, 'metrics': [{'name': 'Wer', 'type': 'wer', 'value': 36.73965936739659}]}
To https://huggingface.co/parambharat/whisper-small-ml
acd6019..973cfdb main -> main
wandb.finish()
VBox(children=(Label(value='5702.348 MB of 5702.348 MB uploaded (166.992 MB deduped)\r'), FloatProgress(value=…
| eval/loss | ▁▃▅▆▇█ |
| eval/runtime | ▃█▅▇▄▁ |
| eval/samples_per_second | ▆▁▄▂▅█ |
| eval/steps_per_second | ▁▁▁▁▁█ |
| eval/wer | ▁█▅▂▅▃ |
| train/epoch | ▁▂▂▂▃▃▃▄▄▅▆▆▆▇▇▇███ |
| train/global_step | ▁▁▂▂▂▂▂▂▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▇▇▇▇▇▇███████ |
| train/learning_rate | ▅█▇▇▆▅▅▄▃▂▂▁ |
| train/loss | █▇▆▅▄▃▃▂▂▁▁▁ |
| train/total_flos | ▁ |
| train/train_loss | ▁ |
| train/train_runtime | ▁ |
| train/train_samples_per_second | ▁ |
| train/train_steps_per_second | ▁ |
| eval/loss | 0.23083 |
| eval/runtime | 149.6244 |
| eval/samples_per_second | 0.668 |
| eval/steps_per_second | 0.027 |
| eval/wer | 36.73966 |
| train/epoch | 28.02 |
| train/global_step | 3000 |
| train/learning_rate | 0.0 |
| train/loss | 0.0276 |
| train/total_flos | 5.501245769220096e+19 |
| train/train_loss | 0.06887 |
| train/train_runtime | 40824.8754 |
| train/train_samples_per_second | 4.703 |
| train/train_steps_per_second | 0.073 |
./wandb/run-20221212_191352-219trr2l/logs
# Inspect the signature/docstring of `evaluate`'s push_to_hub helper.
# NOTE: the trailing `?` is IPython help syntax, not plain Python — this
# cell only runs inside a Jupyter/IPython session.
from evaluate import push_to_hub
push_to_hub?